QSRR revisited. Hierarchical models

Author
Affiliation

Paweł Wiczling*

Department of Biopharmaceutics and Pharmacodynamics, Medical University of Gdańsk, Gen. J. Hallera 107, 80-416 Gdańsk, Poland

Published

April 30, 2025

Introductions

Setup

Code
library(ggplot2)
library(gridExtra)
library(patchwork)
library(tidyr)
library(dplyr)
library(GGally)
library(reshape2)
library(pracma)
library(here)
library(magick)
library(reticulate)
library(kableExtra)
require(visNetwork, quietly = TRUE)
library(igraph)
library(ggraph)
#remotes::install_github("metrumresearchgroup/mrgmisc")
library(tidyverse)
library(patchwork)

set.seed(10271998) ## not required but assures repeatable results

source("helper-functions.R")
Code
data_deliv_dir = here::here("data","deliv")
data_derived_dir = here::here("data","derived")
if(!file.exists(data_deliv_dir)) dir.create(data_deliv_dir, recursive = T)
if(!file.exists(data_derived_dir)) dir.create(data_derived_dir, recursive = T)

figures_eda_dir <- here::here("deliv","figures", "eda")
tables_eda_dir <- here::here("deliv","tables", "eda")
if(!file.exists(figures_eda_dir)) dir.create(figures_eda_dir, recursive = T)
if(!file.exists(tables_eda_dir)) dir.create(tables_eda_dir, recursive = T)

model_dir <- here::here("model","stan")  
figures_dir <- here::here("deliv","figures", "stan")
tables_dir <- here::here("deliv","tables", "stan")
if(!file.exists(figures_dir)) dir.create(figures_dir, recursive = T)
if(!file.exists(model_dir)) dir.create(model_dir, recursive = T)
if(!file.exists(tables_dir)) dir.create(tables_dir, recursive = T)

EDA

We used a publicly available dataset that comprises the measurements of RP-HPLC retention times collected for 1026 analytes. The retention times were measured under isocratic conditions on Eclipse Plus C18 (Agilent) stationary phase with 3.5 μm particles. The experiments were conducted using a mixture of two solvents: solvent A, which was made of 0.1% formic acid in water, and solvent B, which was made of 0.1% formic acid in acetonitrile. The column temperature was set at 35^{}C. The data were collected by Boswell et al. and were used to create a method to predict retention time by Back-Calculating the Gradient.

Load data

Code
data <- read.csv(here(data_deliv_dir, "database_logk_1026.csv"), header = TRUE)
analytes_names  <- read.csv(here(data_deliv_dir,"database_logk_1026_analyte_names.csv"), header = TRUE)
smiles <- read.csv(here(data_deliv_dir,"smiles1026.smi"), sep = "\t", header = FALSE)
functional_groups = read.csv(here(data_deliv_dir, 'checkmol_functional_groups.csv'))
functional_groups_names = read.csv(here(data_deliv_dir,'checkmol_functional_group_names.csv'))
data_ACD = read.csv(here(data_deliv_dir,'ACD_pKas.csv'))
data_pH <- read.csv(here(data_deliv_dir,"pH.csv"),header = TRUE)
results <-read.csv(here(data_deliv_dir,"results_table_max_frag_size.csv"))
comparison_table = read.csv(file = here(data_deliv_dir,"comparison_table.csv"), header = TRUE)
smiles<-smiles %>% rename(ID=V2,smiles=V1) %>% select(ID,smiles)
smiles$smiles[905] = "CN(C1CCCCC1N1CCCC1)C(=O)Cc1ccc(c(c1)Cl)Cl" # remove tartrate moiety 
smiles$smiles[425] = "CC(Cc1ccc(cc1)OCC(=O)O)NCC(c1cccc(c1)Cl)O" # remove Na+ and dissociation

analytes_names$Analyte <- iconv(analytes_names$Analyte, from = "", to = "UTF-8", sub = "byte")

Python

Code
# use_condaenv("rdkit-env", conda = "C:/Users/GUMed/anaconda3/Scripts/conda.exe", required = TRUE)
#.Renviron
Code
Chem <- import("rdkit.Chem")
AllChem <- import("rdkit.Chem.AllChem")
Draw <- import("rdkit.Chem.Draw")
rdFMCS <- import("rdkit.Chem.rdFMCS")
rdmolops <- import("rdkit.Chem.rdmolops")

# Create MCS parameters
mcs_params <- rdFMCS$MCSParameters()
mcs_params$AtomCompareParameters$MatchValences <- TRUE
mcs_params$AtomCompareParameters$RingMatchesRingOnly <- TRUE
mcs_params$AtomCompareParameters$CompleteRingsOnly <- TRUE
mcs_params$AtomCompare <- rdFMCS$AtomCompare$CompareElements
mcs_params$BondCompare <- rdFMCS$BondCompare$CompareOrderExact
mcs_params$BondCompareParameters$CompleteRingsOnly <- TRUE
mcs_params$BondCompareParameters$RingMatchesRingOnly <- TRUE
mcs_params$Timeout <- 20L
mcs_params$Verbose <- FALSE
mcs_params$Threshold <- 1.0
mcs_params$MaximizeBonds<-TRUE

Maximum Common Substructure

Code
library(reticulate)
library(future.apply)

smiles_vec <- smiles$smiles  
pairs <- combn(seq_along(smiles_vec), 2, simplify = FALSE)
plan(multisession, workers = 20)  

results_list <- future_lapply(pairs, function(idx_pair) {
  
  library(reticulate)
  library(here)
  source_python(here("scripts/mcs_single_maxfragment.py"))  # re-import inside worker
  
  i <- idx_pair[1]
  j <- idx_pair[2]
  
  mcs_size <- compute_mcs_and_fragment_diff(smiles_vec[i], smiles_vec[j])
  
  list(mol1_index = i, mol2_index = j, mcs_size = mcs_size)
}, future.seed = TRUE)

# Bind all results
results <- do.call(rbind, lapply(results_list, as.data.frame))
write.csv(results, file = here(data_deliv_dir,"results_table_max_frag_size.csv"), row.names = FALSE)

Subsets

Code
results_selected <- results %>%
  mutate(crit = mcs_size)%>%
  group_by(mol1_index) %>%
  slice(which.min(crit))

MCS Subsets

Code
# Extract the vectors
rows <- smiles$smiles[results_selected$mol1_index]
cols <- smiles$smiles[results_selected$mol2_index]

comparison_table <- purrr::map2_df(rows, cols, compare_smiles_pair,
                                   .progress = list(
  type = "iterator", 
  format = "Calculating {cli::pb_bar} {cli::pb_percent} {cli::pb_current}/{cli::pb_total}",
  clear = FALSE))

write.csv(comparison_table, file = here(data_deliv_dir,"comparison_table.csv"), row.names = FALSE)

Similarity of MCS

Code
# Extract the vectors
rows <- comparison_table$common1
cols <- comparison_table$common2

comparison_table$are_similar <- as.double(purrr::map2_lgl(rows, cols, are_smiles_identical))

Add membership groups

Code
results_selected <- results_selected %>% select(-any_of("group"))
results_selected_crit<-results_selected %>% filter(crit<7) 

uid = unique(c(results_selected_crit$mol1_index,results_selected_crit$mol2_index))
nodes <- data.frame(id = uid,label = paste(uid))
edges <- data.frame(from = results_selected_crit$mol1_index,
                    to = results_selected_crit$mol2_index, 
                    width =results_selected_crit$crit,
                    label = paste(results_selected_crit$crit))
g <- graph_from_data_frame(d = edges, vertices = nodes, directed = FALSE)
components <- components(g, mode = "weak")
nodes$group <- components$membership

nodes$atom_count <- unlist(lapply(smiles$smiles[nodes$id], get_atom_count)) 

results_selected <- results_selected%>%
  mutate(id=mol1_index) %>%
  left_join(nodes) %>%
   select(-c(label, id)) %>%
  mutate(group = ifelse(is.na(group), 0, group))

tabulate(results_selected$group+1)
 [1] 299  16 169  23   9  38  26   2   4 118   6   7   7   2   2  34  12  15   2
[20]   3   2   2   2   2   3   3  15   2   3   3   2   3   2   5   3   2   3   3
[39]   2   2   2   9   3   6  15   8   3   2   2   2   3  11   3   2   5   2   4
[58]   4   3   2   3   2   4   2   2   2   2   3   3   4   2   2   2   3   2   2
[77]   2   2   3   2   2   2   2   2   3   4   6   2   2   2

Visualise Network

Code
visNetwork(nodes, edges, width = "100%")

Orient graph

Code
orient_graph_no_multiple_incoming <- function(g) {
  stopifnot(!is_directed(g))
  
  degs <- degree(g)
  node_order <- names(sort(degs, decreasing = TRUE))
  
  g_directed <- make_empty_graph(n = vcount(g), directed = TRUE)
  V(g_directed)$name <- V(g)$name
  
  has_parent <- setNames(rep(FALSE, vcount(g)), V(g)$name)
  
  for (node in node_order) {
    neighbors_node <- neighbors(g, node, mode = "all")
    
    for (nbr in neighbors_node) {
      nbr_name <- V(g)[nbr]$name
      
      # Only consider edge if neighbor has no incoming edge
      if (!has_parent[nbr_name] && node != nbr_name) {
        
        # Check if adding edge would introduce a cycle
        temp_graph <- add_edges(g_directed, c(node, nbr_name))
        if (is_dag(temp_graph)) {
          g_directed <- temp_graph
          has_parent[nbr_name] <- TRUE
        }
      }
    }
  }
  
  return(g_directed)
}

# Apply it
g_directed <- orient_graph_no_multiple_incoming(g)

Visualise Network

Code
# Create nodes data frame
nodes <- data.frame(
  id = V(g_directed)$name,
  label = V(g_directed)$name,
  stringsAsFactors = FALSE
)

components <- components(g, mode = "weak")
nodes$group <- components$membership

# Create edges data frame
edges <- igraph::as_data_frame(g_directed, what = "edges")
colnames(edges) <- c("from", "to")

#nodes with no outgoing edges
nodes_noin <- V(g_directed)[degree(g_directed, mode = "in") == 0]
nodes_noin <- names(nodes_noin)

visNetwork(nodes, edges, width = "100%") %>%
  visEdges(arrows = "to")

Equation output:

Code
topo_order <- topo_sort(g_directed, mode = "out")
topo_order <- names(topo_order)

# Initialize a named list to store equations
equations <- setNames(vector("list", length(topo_order)), topo_order)

for (node in sorted_nodes) {
  parents <- names(neighbors(g_directed, node, mode = "in"))
  
  if (length(parents) == 0) {
    # If no parents, it's a root node
    equations[[node]] <- node
  } else {
    # Sum of parents
    equations[[node]] <- paste0(node, " = ", paste(parents, collapse = " + "))
  }
}

# Print all equations
equations_vec <- unlist(equations)
cat(paste(equations_vec, collapse = "\n"))

Plot

Code
plotting_i = function(i, .ilist) {
  data %>% 
    filter(ID %in% c(.ilist$id1[i], .ilist$id2[i])) %>%
    ggplot(aes(x = fi, y = logk, group = ID)) + 
    geom_line(aes(color = logP_ACD)) + 
    labs(
      title = paste(analytes_names$Analyte[.ilist$id1[i]], "\n", analytes_names$Analyte[.ilist$id2[i]]), 
      x = "\u03C6", 
      y = expression(log~k[obs])
    ) + 
    theme_gray(base_size = 10) + 
    theme(legend.position = "none")+
    theme(plot.title = element_text(size = 5), 
          axis.title = element_text(size = 7),
          axis.text = element_text(size = 5))
}

list0 = comparison_table%>% filter(similarity==0)
list1 = comparison_table%>% filter(similarity==1)
list2 = comparison_table%>% filter(similarity==2)
list10 = comparison_table%>% filter(similarity==10)
map(1:min(20, nrow(list0)), \(x) plotting_i(x, list0)) %>% wrap_plots(ncol = 4)

Code
map(1:min(20, nrow(list1)), \(x) plotting_i(x, list1)) %>% wrap_plots(ncol = 4)

Code
map(1:min(20, nrow(list2)), \(x) plotting_i(x, list2)) %>% wrap_plots(ncol = 4)

Code
map(1:min(20, nrow(list10)), \(x) plotting_i(x, list10)) %>% wrap_plots(ncol = 4)

Test

Code
# Extract the vectors

idx = 131
smiles1 <- smiles$smiles[results_selected$mol1_index[idx]]
smiles2 <- smiles$smiles[results_selected$mol2_index[idx]]

comparison_table_x <- purrr::map2_df(smiles1, smiles2, compare_smiles_pair)
comparison_table_x$id1 = results_selected$mol1_index[idx]
comparison_table_x$id2 = results_selected$mol2_index[idx]

comparison_table_x%>%
  select("id1", "id2", "structure") %>%
  kbl(escape = FALSE, format = "html", align = "l") %>%
  kable_styling(full_width = FALSE, position = "center", bootstrap_options = c("striped", "hover", "condensed"))

comparison_table_x

Session info

Code
sessionInfo()
R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19045)

Matrix products: default


locale:
[1] LC_COLLATE=Polish_Poland.utf8  LC_CTYPE=Polish_Poland.utf8   
[3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C                  
[5] LC_TIME=Polish_Poland.utf8    

time zone: Europe/Warsaw
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] lubridate_1.9.3   forcats_1.0.0     stringr_1.5.1     purrr_1.0.2      
 [5] readr_2.1.5       tibble_3.2.1      tidyverse_2.0.0   ggraph_2.2.1     
 [9] igraph_2.1.4      visNetwork_2.1.2  kableExtra_1.4.0  reticulate_1.42.0
[13] magick_2.8.6      here_1.0.1        pracma_2.4.4      reshape2_1.4.4   
[17] GGally_2.2.1      dplyr_1.1.4       tidyr_1.3.1       patchwork_1.2.0  
[21] gridExtra_2.3     ggplot2_3.5.1    

loaded via a namespace (and not attached):
 [1] gtable_0.3.5       xfun_0.44          htmlwidgets_1.6.4  ggrepel_0.9.6     
 [5] lattice_0.21-8     tzdb_0.4.0         vctrs_0.6.5        tools_4.3.1       
 [9] generics_0.1.3     fansi_1.0.6        pkgconfig_2.0.3    Matrix_1.6-5      
[13] RColorBrewer_1.1-3 lifecycle_1.0.4    compiler_4.3.1     farver_2.1.2      
[17] textshaping_0.4.0  munsell_0.5.1      ggforce_0.4.2      graphlayouts_1.2.2
[21] htmltools_0.5.8.1  yaml_2.3.8         pillar_1.9.0       MASS_7.3-60       
[25] cachem_1.1.0       viridis_0.6.5      ggstats_0.8.0      tidyselect_1.2.1  
[29] digest_0.6.35      stringi_1.8.4      labeling_0.4.3     polyclip_1.10-7   
[33] rprojroot_2.0.4    fastmap_1.2.0      grid_4.3.1         colorspace_2.1-0  
[37] cli_3.6.2          magrittr_2.0.3     tidygraph_1.3.1    utf8_1.2.4        
[41] withr_3.0.2        scales_1.3.0       timechange_0.3.0   rmarkdown_2.27    
[45] hms_1.1.3          ragg_1.3.2         png_0.1-8          memoise_2.0.1     
[49] evaluate_1.0.3     knitr_1.46         viridisLite_0.4.2  rlang_1.1.3       
[53] Rcpp_1.0.12        glue_1.7.0         tweenr_2.0.3       xml2_1.3.6        
[57] svglite_2.1.3      rstudioapi_0.16.0  jsonlite_1.8.8     R6_2.5.1          
[61] plyr_1.8.9         systemfonts_1.1.0